library(tidyverse)
library(ggplot2)
library(stringr)
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)Final Project
Challenge Overview
Today’s challenge is to:
- read in multiple data sets, and describe the data set using both words and any supporting information (e.g., tables, etc)
- tidy data (as needed, including sanity checks)
- mutate variables as needed (including sanity checks)
- join two or more data sets and analyze some aspect of the joined data
(be sure to only include the category tags for the data you use!)
Read in data
file_path <- "_data/_players/"
csv_file_names <- list.files(path="_data/_players/", pattern="players*")
csv_file_names[1] "players_15.csv" "players_16.csv" "players_17.csv" "players_18.csv"
[5] "players_19.csv" "players_20.csv"
Briefly describe the data
Tidy Data (as needed)
Is your data already tidy, or is there work to be done? Be sure to anticipate your end result to provide a sanity check, and document your work here.
csv_file_names %>%
purrr::map(function(file_name){
assign(x = str_remove(file_name, ".csv"),
value = read_csv(paste0(file_path, file_name)),
envir = .GlobalEnv)
})[[1]]
# A tibble: 15,465 × 104
sofifa_id player_url short…¹ long_…² age dob heigh…³ weigh…⁴ natio…⁵
<dbl> <chr> <chr> <chr> <dbl> <date> <dbl> <dbl> <chr>
1 158023 https://s… L. Mes… Lionel… 27 1987-06-24 169 67 Argent…
2 20801 https://s… Cristi… Cristi… 29 1985-02-05 185 80 Portug…
3 9014 https://s… A. Rob… Arjen … 30 1984-01-23 180 80 Nether…
4 41236 https://s… Z. Ibr… Zlatan… 32 1981-10-03 195 95 Sweden
5 167495 https://s… M. Neu… Manuel… 28 1986-03-27 193 92 Germany
6 176580 https://s… L. Suá… Luis A… 27 1987-01-24 181 81 Uruguay
7 183277 https://s… E. Haz… Eden H… 23 1991-01-07 173 74 Belgium
8 7826 https://s… R. van… Robin … 30 1983-08-06 187 71 Nether…
9 121944 https://s… B. Sch… Bastia… 29 1984-08-01 183 79 Germany
10 156616 https://s… F. Rib… Franck… 31 1983-04-07 170 72 France
# … with 15,455 more rows, 95 more variables: club <chr>, overall <dbl>,
# potential <dbl>, value_eur <dbl>, wage_eur <dbl>, player_positions <chr>,
# preferred_foot <chr>, international_reputation <dbl>, weak_foot <dbl>,
# skill_moves <dbl>, work_rate <chr>, body_type <chr>, real_face <chr>,
# release_clause_eur <lgl>, player_tags <chr>, team_position <chr>,
# team_jersey_number <dbl>, loaned_from <chr>, joined <date>,
# contract_valid_until <dbl>, nation_position <chr>, …
# ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
[[2]]
# A tibble: 14,881 × 104
sofifa_id player_url short…¹ long_…² age dob heigh…³ weigh…⁴ natio…⁵
<dbl> <chr> <chr> <chr> <dbl> <date> <dbl> <dbl> <chr>
1 158023 https://s… L. Mes… Lionel… 28 1987-06-24 170 72 Argent…
2 20801 https://s… Cristi… Cristi… 30 1985-02-05 185 80 Portug…
3 9014 https://s… A. Rob… Arjen … 31 1984-01-23 180 80 Nether…
4 167495 https://s… M. Neu… Manuel… 29 1986-03-27 193 92 Germany
5 176580 https://s… L. Suá… Luis A… 28 1987-01-24 182 85 Uruguay
6 183277 https://s… E. Haz… Eden H… 24 1991-01-07 173 74 Belgium
7 41236 https://s… Z. Ibr… Zlatan… 33 1981-10-03 195 95 Sweden
8 190871 https://s… Neymar Neymar… 23 1992-02-05 174 68 Brazil
9 164240 https://s… Thiago… Thiago… 30 1984-09-22 183 79 Brazil
10 168542 https://s… David … David … 29 1986-01-08 170 67 Spain
# … with 14,871 more rows, 95 more variables: club <chr>, overall <dbl>,
# potential <dbl>, value_eur <dbl>, wage_eur <dbl>, player_positions <chr>,
# preferred_foot <chr>, international_reputation <dbl>, weak_foot <dbl>,
# skill_moves <dbl>, work_rate <chr>, body_type <chr>, real_face <chr>,
# release_clause_eur <lgl>, player_tags <chr>, team_position <chr>,
# team_jersey_number <dbl>, loaned_from <chr>, joined <date>,
# contract_valid_until <dbl>, nation_position <chr>, …
# ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
[[3]]
# A tibble: 17,009 × 104
sofifa_id player_url short…¹ long_…² age dob heigh…³ weigh…⁴ natio…⁵
<dbl> <chr> <chr> <chr> <dbl> <date> <dbl> <dbl> <chr>
1 20801 https://s… Cristi… Cristi… 31 1985-02-05 185 80 Portug…
2 158023 https://s… L. Mes… Lionel… 29 1987-06-24 170 72 Argent…
3 190871 https://s… Neymar Neymar… 24 1992-02-05 174 68 Brazil
4 167495 https://s… M. Neu… Manuel… 30 1986-03-27 193 92 Germany
5 176580 https://s… L. Suá… Luis A… 29 1987-01-24 182 85 Uruguay
6 193080 https://s… De Gea David … 25 1990-11-07 193 82 Spain
7 41236 https://s… Z. Ibr… Zlatan… 34 1981-10-03 195 95 Sweden
8 173731 https://s… G. Bale Gareth… 26 1989-07-16 183 74 Wales
9 183907 https://s… J. Boa… Jérôme… 27 1988-09-03 192 90 Germany
10 188545 https://s… R. Lew… Robert… 27 1988-08-21 185 79 Poland
# … with 16,999 more rows, 95 more variables: club <chr>, overall <dbl>,
# potential <dbl>, value_eur <dbl>, wage_eur <dbl>, player_positions <chr>,
# preferred_foot <chr>, international_reputation <dbl>, weak_foot <dbl>,
# skill_moves <dbl>, work_rate <chr>, body_type <chr>, real_face <chr>,
# release_clause_eur <lgl>, player_tags <chr>, team_position <chr>,
# team_jersey_number <dbl>, loaned_from <chr>, joined <date>,
# contract_valid_until <dbl>, nation_position <chr>, …
# ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
[[4]]
# A tibble: 17,592 × 104
sofifa_id player_url short…¹ long_…² age dob heigh…³ weigh…⁴ natio…⁵
<dbl> <chr> <chr> <chr> <dbl> <date> <dbl> <dbl> <chr>
1 20801 https://s… Cristi… Cristi… 32 1985-02-05 185 80 Portug…
2 158023 https://s… L. Mes… Lionel… 30 1987-06-24 170 72 Argent…
3 190871 https://s… Neymar Neymar… 25 1992-02-05 175 68 Brazil
4 167495 https://s… M. Neu… Manuel… 31 1986-03-27 193 92 Germany
5 176580 https://s… L. Suá… Luis A… 30 1987-01-24 182 86 Uruguay
6 188545 https://s… R. Lew… Robert… 28 1988-08-21 185 79 Poland
7 193080 https://s… De Gea David … 26 1990-11-07 193 76 Spain
8 183277 https://s… E. Haz… Eden H… 26 1991-01-07 173 76 Belgium
9 155862 https://s… Sergio… Sergio… 31 1986-03-30 183 75 Spain
10 167664 https://s… G. Hig… Gonzal… 29 1987-12-10 184 87 Argent…
# … with 17,582 more rows, 95 more variables: club <chr>, overall <dbl>,
# potential <dbl>, value_eur <dbl>, wage_eur <dbl>, player_positions <chr>,
# preferred_foot <chr>, international_reputation <dbl>, weak_foot <dbl>,
# skill_moves <dbl>, work_rate <chr>, body_type <chr>, real_face <chr>,
# release_clause_eur <dbl>, player_tags <chr>, team_position <chr>,
# team_jersey_number <dbl>, loaned_from <chr>, joined <date>,
# contract_valid_until <dbl>, nation_position <chr>, …
# ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
[[5]]
# A tibble: 17,770 × 104
sofifa_id player_url short…¹ long_…² age dob heigh…³ weigh…⁴ natio…⁵
<dbl> <chr> <chr> <chr> <dbl> <date> <dbl> <dbl> <chr>
1 20801 https://s… Cristi… Cristi… 33 1985-02-05 187 83 Portug…
2 158023 https://s… L. Mes… Lionel… 31 1987-06-24 170 72 Argent…
3 190871 https://s… Neymar… Neymar… 26 1992-02-05 175 68 Brazil
4 193080 https://s… De Gea David … 27 1990-11-07 193 76 Spain
5 192985 https://s… K. De … Kevin … 27 1991-06-28 181 70 Belgium
6 155862 https://s… Sergio… Sergio… 32 1986-03-30 184 82 Spain
7 176580 https://s… L. Suá… Luis A… 31 1987-01-24 182 86 Uruguay
8 177003 https://s… L. Mod… Luka M… 32 1985-09-09 172 66 Croatia
9 183277 https://s… E. Haz… Eden H… 27 1991-01-07 173 74 Belgium
10 200389 https://s… J. Obl… Jan Ob… 25 1993-01-07 188 87 Sloven…
# … with 17,760 more rows, 95 more variables: club <chr>, overall <dbl>,
# potential <dbl>, value_eur <dbl>, wage_eur <dbl>, player_positions <chr>,
# preferred_foot <chr>, international_reputation <dbl>, weak_foot <dbl>,
# skill_moves <dbl>, work_rate <chr>, body_type <chr>, real_face <chr>,
# release_clause_eur <dbl>, player_tags <chr>, team_position <chr>,
# team_jersey_number <dbl>, loaned_from <chr>, joined <date>,
# contract_valid_until <dbl>, nation_position <chr>, …
# ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
[[6]]
# A tibble: 18,278 × 104
sofifa_id player_url short…¹ long_…² age dob heigh…³ weigh…⁴ natio…⁵
<dbl> <chr> <chr> <chr> <dbl> <date> <dbl> <dbl> <chr>
1 158023 https://s… L. Mes… Lionel… 32 1987-06-24 170 72 Argent…
2 20801 https://s… Cristi… Cristi… 34 1985-02-05 187 83 Portug…
3 190871 https://s… Neymar… Neymar… 27 1992-02-05 175 68 Brazil
4 200389 https://s… J. Obl… Jan Ob… 26 1993-01-07 188 87 Sloven…
5 183277 https://s… E. Haz… Eden H… 28 1991-01-07 175 74 Belgium
6 192985 https://s… K. De … Kevin … 28 1991-06-28 181 70 Belgium
7 192448 https://s… M. ter… Marc-A… 27 1992-04-30 187 85 Germany
8 203376 https://s… V. van… Virgil… 27 1991-07-08 193 92 Nether…
9 177003 https://s… L. Mod… Luka M… 33 1985-09-09 172 66 Croatia
10 209331 https://s… M. Sal… Mohame… 27 1992-06-15 175 71 Egypt
# … with 18,268 more rows, 95 more variables: club <chr>, overall <dbl>,
# potential <dbl>, value_eur <dbl>, wage_eur <dbl>, player_positions <chr>,
# preferred_foot <chr>, international_reputation <dbl>, weak_foot <dbl>,
# skill_moves <dbl>, work_rate <chr>, body_type <chr>, real_face <chr>,
# release_clause_eur <dbl>, player_tags <chr>, team_position <chr>,
# team_jersey_number <dbl>, loaned_from <chr>, joined <date>,
# contract_valid_until <dbl>, nation_position <chr>, …
# ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
Are there any variables that require mutation to be usable in your analysis stream? For example, do you need to calculate new values in order to graph them? Can string values be represented numerically? Do you need to turn any variables into factors and reorder for ease of graphics and visualization?
Document your work here.
players_15 <- players_15 %>%
mutate(fifa_type = "15") %>%
select(-c(player_url, long_name, body_type, real_face,release_clause_eur,
player_tags,team_position, team_jersey_number, loaned_from, nation_position,nation_jersey_number,
player_traits,mentality_composure,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,
rwb,lb,lcb,cb,rcb,rb))
players_16 <-players_16 %>%
mutate(fifa_type = "16") %>%
select(-c(player_url, long_name, body_type, real_face,release_clause_eur,
player_tags,team_position, team_jersey_number, loaned_from, nation_position,nation_jersey_number,
player_traits,mentality_composure,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,
rwb,lb,lcb,cb,rcb,rb))
players_17<- players_17 %>%
mutate(fifa_type = "17") %>%
select(-c(player_url, long_name, body_type, real_face,release_clause_eur,
player_tags,team_position, team_jersey_number, loaned_from, nation_position,nation_jersey_number,
player_traits,mentality_composure,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,
rwb,lb,lcb,cb,rcb,rb))
players_18 <- players_18 %>%
mutate(fifa_type = "18") %>%
select(-c(player_url, long_name, body_type, real_face,release_clause_eur,
player_tags,team_position, team_jersey_number, loaned_from, nation_position,nation_jersey_number,
player_traits,mentality_composure,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,
rwb,lb,lcb,cb,rcb,rb))
players_19<- players_19 %>%
mutate(fifa_type = "19") %>%
select(-c(player_url, long_name, body_type, real_face,release_clause_eur,
player_tags,team_position, team_jersey_number, loaned_from, nation_position,nation_jersey_number,
player_traits,mentality_composure,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,
rwb,lb,lcb,cb,rcb,rb))
players_20<- players_20 %>%
mutate(fifa_type = "20") %>%
select(-c(player_url, long_name, body_type, real_face,release_clause_eur,
player_tags,team_position, team_jersey_number, loaned_from, nation_position,nation_jersey_number,
player_traits,mentality_composure,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,
rwb,lb,lcb,cb,rcb,rb))Join Data
Be sure to include a sanity check, and double-check that case count is correct!
players_15players_16players_17players_18players_19players_20vars_mut<- players_15%>%
select(-c(sofifa_id, short_name, age, dob, height_cm, weight_kg, nationality, club , overall,
potential, value_eur, wage_eur, player_positions, preferred_foot, international_reputation
,weak_foot,skill_moves, work_rate, joined ,contract_valid_until, pace, shooting, passing
,dribbling, defending, physic, gk_diving, gk_handling, gk_kicking ,gk_reflexes,
gk_speed,gk_positioning, fifa_type))
col_var_mut <- colnames(vars_mut)
#players_15sub.var <- function(x, na.rm=FALSE) (str_sub(x,start = 1, end = 2))
players_15_tidy<- players_15%>%
mutate_at(col_var_mut, sub.var) %>%
mutate_at(col_var_mut, as.double)
players_16_tidy <- players_16%>%
mutate_at(col_var_mut, sub.var) %>%
mutate_at(col_var_mut, as.double)
players_17_tidy <- players_17%>%
mutate_at(col_var_mut, sub.var) %>%
mutate_at(col_var_mut, as.double)
players_18_tidy <- players_18%>%
mutate_at(col_var_mut, sub.var) %>%
mutate_at(col_var_mut, as.double)
players_19_tidy <- players_19%>%
mutate_at(col_var_mut, sub.var) %>%
mutate_at(col_var_mut, as.double)here i scraped the characters off all stats.
#players_15_tidy
#players_16_tidy
#players_17_tidy
#players_18_tidy
#players_19_tidy
#players_20FIFA<- full_join(players_15_tidy,players_16_tidy) %>%
full_join(players_17_tidy) %>%
full_join(players_18_tidy) %>%
full_join(players_19_tidy) %>%
full_join(players_20)
# FIFAsub.var1 <- function(x, na.rm=FALSE) (str_sub(x,start = 1, end = 3))
vis1<- FIFA %>%
select(fifa_type,player_positions, height_cm) %>%
group_by(fifa_type, player_positions) %>%
mutate_at("player_positions",sub.var1)%>%
summarise(avg = mean(height_cm)) %>%
mutate(player_positions = str_replace(player_positions, "[:punct:]", "+"))
vis1ggplot(vis1, aes(x=fifa_type, y=avg, group=player_positions, color =player_positions )) +
geom_point(size=1) +
geom_line() +
theme(legend.position = "right") +
labs(title = "Average Height of all postions Over Years",
x = "Year of FIFA", y = "Average Height") +
theme_bw()
FIFA_tidy<- FIFA %>%
pivot_longer(col =-c(sofifa_id, short_name, age, dob, height_cm, weight_kg, nationality, club , overall,
potential, value_eur, wage_eur, player_positions, preferred_foot, international_reputation
,weak_foot,skill_moves, work_rate, joined ,contract_valid_until, fifa_type) , names_to = "stat",
values_to = "value", values_drop_na = TRUE )
#FIFA_tidypossible ideas:
Historical comparison between Messi and Ronaldo (what skill attributes changed the most during time - compared to real-life stats);
Ideal budget to create a competitive team (at the level of top n teams in Europe) and at which point the budget does not allow to buy significantly better players for the 11-men lineup. An extra is the same comparison with the Potential attribute for the lineup instead of the Overall attribute;
Sample analysis of top n% players (e.g. top 5% of the player) to see if some important attributes as Agility or BallControl or Strength have been popular or not acroos the FIFA versions. An example would be seeing that the top 5% players of FIFA 20 are more fast (higher Acceleration and Agility) compared to FIFA 15. The trend of attributes is also an important indication of how some attributes are necessary for players to win games (a version with more top 5% players with high BallControl stats would indicate that the game is more focused on the technique rather than the physicial aspect).
height of cbs over the years
FIFA_tidy %>%
select(fifa_type,player_positions, height_cm) %>%
filter(player_positions == "CB")